library(phyloseq)
library(vegan)
Loading required package: permute
Loading required package: lattice
This is vegan 2.4-0
library(ade4)
Attaching package: ‘ade4’
The following object is masked from ‘package:vegan’:
cca
library(ggplot2)
library(ggthemes)
library(stringr)
library(tidyr)
library(dplyr)
Attaching package: ‘dplyr’
The following objects are masked from ‘package:stats’:
filter, lag
The following objects are masked from ‘package:base’:
intersect, setdiff, setequal, union
load("../data/STAT.rdata")
distance() function to compute JSD distance of the normalized microbiome data.dist_jsd <- phyloseq::distance(phy, method = distanceMethodList$JSD)
str(dist_jsd)
Class 'dist' atomic [1:4560] 0.1539 0.0959 0.0971 0.0817 0.1112 ...
..- attr(*, "Labels")= chr [1:96] "cecal_C1" "cecal_C10" "cecal_C2" "cecal_C3" ...
..- attr(*, "Size")= int 96
..- attr(*, "call")= language as.dist.default(m = DistMat)
..- attr(*, "Diag")= logi FALSE
..- attr(*, "Upper")= logi FALSE
plot(hclust(dist_jsd, "single"))
hclust(dist_jsd, "average") %>%
plot()
hclust(dist_jsd, "complete") %>%
plot()
Plot the dendrograms from 3.
Use rect.hclust function to produce best discrete clusters from the dendrograms.
hclust_single = hclust(dist_jsd, "single")
plot(hclust_single)
rect.hclust(hclust_single, k = 3)
hclust_average = hclust(dist_jsd, "average")
plot(hclust_average)
rect.hclust(hclust_average, k = 3)
hclust_complete = hclust(dist_jsd, "complete")
plot(hclust_complete)
rect.hclust(hclust_complete, k = 3)
plot(cophenetic(hclust_single), dist_jsd)
plot(dist_jsd, cophenetic(hclust_average))
plot(dist_jsd, cophenetic(hclust_complete))
How well do the cophenetic distances represent the original distances? Compute correlations.
Use Partitioning around Medoids, pam, to cluster the JSD distances into two clusters.
pam_jsd <- cluster::pam(dist_jsd, 2)
str(pam_jsd)
List of 9
$ medoids : chr [1:2] "cecal_P3" "fecal_C2"
$ id.med : int [1:2] 14 53
$ clustering: Named int [1:96] 1 1 1 1 1 1 1 1 1 1 ...
..- attr(*, "names")= chr [1:96] "cecal_C1" "cecal_C10" "cecal_C2" "cecal_C3" ...
$ objective : Named num [1:2] 0.133 0.123
..- attr(*, "names")= chr [1:2] "build" "swap"
$ isolation : Factor w/ 3 levels "no","L","L*": 1 1
..- attr(*, "names")= chr [1:2] "1" "2"
$ clusinfo : num [1:2, 1:5] 49 47 0.372 0.275 0.129 ...
..- attr(*, "dimnames")=List of 2
.. ..$ : NULL
.. ..$ : chr [1:5] "size" "max_diss" "av_diss" "diameter" ...
$ silinfo :List of 3
..$ widths : num [1:96, 1:3] 1 1 1 1 1 1 1 1 1 1 ...
.. ..- attr(*, "dimnames")=List of 2
.. .. ..$ : chr [1:96] "cecal_V10" "cecal_VP7" "cecal_VP8" "cecal_C10" ...
.. .. ..$ : chr [1:3] "cluster" "neighbor" "sil_width"
..$ clus.avg.widths: num [1:2] 0.377 0.424
..$ avg.width : num 0.4
$ diss : NULL
$ call : language cluster::pam(x = dist_jsd, k = 2)
- attr(*, "class")= chr [1:2] "pam" "partition"
# plot(pam_jsd)
table(sample_data(phy)$Treatment, pam_jsd$clustering)
1 2
C 10 10
P 10 9
T 10 9
V 10 10
VP 9 9
table(sample_data(phy)$Location, pam_jsd$clustering)
1 2
cecal 49 1
fecal 0 46
library(clusterSim)
cluster.tr = table(sample_data(phy)$Treatment, pam_jsd$clustering)
chisq.test(cluster.tr)
Pearson's Chi-squared test
data: cluster.tr
X-squared = 0.063624, df = 4, p-value = 0.9995
cluster.loc = table(sample_data(phy)$Location, pam_jsd$clustering)
chisq.test(cluster.loc)
Pearson's Chi-squared test with Yates' continuity correction
data: cluster.loc
X-squared = 88.198, df = 1, p-value < 2.2e-16
pam1 <- function(x,k) list(cluster = pam(as.dist(x),k, cluster.only=TRUE))
gsPam1 <- clusGap(as.matrix(dist_jsd), FUN = pam1, K.max = 20, B = 100)
Clustering k = 1,2,..., K.max (= 20): .. done
Bootstrapping, b = 1,2,..., B (= 100) [one "." per sample]:
.................................................. 50
.................................................. 100
par(mfrow=c(1,1))
plot(gsPam1)
gsPam1$Tab %>%
as.data.frame() %>%
ggplot(aes(x = 1:20, y = gap)) +
geom_line() +
geom_errorbar(aes(ymin = gap - 2*SE.sim, ymax = gap + 2*SE.sim),
colour = "red") +
geom_errorbar(aes(ymin = gap - SE.sim, ymax = gap + SE.sim),
colour = "black", linetype = "dashed")
Install and load the following packages: “randomForest”, “kernlab”, “ROCR”.
Use the provided 6-fold cross validation functions (svm.kfoldAUC and rf.kfoldAUC) to estimate prediction accuracy in 100 repetitions using Phylum level data for:
A. Location; B. Antibiotic vs. Control within fecal samples; C. Antibiotic vs. Control within cecal samples.
Alternatively, use the “caret” package to accomplish the same.
source("../labs/Lab-07-Clustering-and-Classification/CV_protocols.R")
Attaching package: ‘kernlab’
The following object is masked from ‘package:ggplot2’:
alpha
The following object is masked from ‘package:permute’:
how
Loading required package: gplots
Attaching package: ‘gplots’
The following object is masked from ‘package:stats’:
lowess
randomForest 4.6-12
Type rfNews() to see new features/changes/bug fixes.
Attaching package: ‘randomForest’
The following object is masked from ‘package:dplyr’:
combine
The following object is masked from ‘package:ggplot2’:
margin
runInParallel = TRUE
if(runInParallel){
library("doParallel")
registerDoParallel(cl = parallel::detectCores() - 1L)
}
Loading required package: foreach
foreach: simple, scalable parallel programming from Revolution Analytics
Use Revolution R for scalability, fault tolerance and more.
http://www.revolutionanalytics.com
Loading required package: iterators
Loading required package: parallel
preds <- subset_samples(phy_phy, Location == 'cecal') %>%
otu_table() %>%
t() %>%
as.matrix()
resp <- subset_samples(phy_phy, Location == 'cecal') %>%
sample_data() %>%
.[["Treatment"]] %>%
str_replace_all("^(?!C).*$", "Antibiotic") %>%
str_replace_all("^C", "Control")
resp
[1] "Control" "Control" "Control" "Control" "Control" "Control" "Control" "Control" "Control" "Control"
[11] "Antibiotic" "Antibiotic" "Antibiotic" "Antibiotic" "Antibiotic" "Antibiotic" "Antibiotic" "Antibiotic" "Antibiotic" "Antibiotic"
[21] "Antibiotic" "Antibiotic" "Antibiotic" "Antibiotic" "Antibiotic" "Antibiotic" "Antibiotic" "Antibiotic" "Antibiotic" "Antibiotic"
[31] "Antibiotic" "Antibiotic" "Antibiotic" "Antibiotic" "Antibiotic" "Antibiotic" "Antibiotic" "Antibiotic" "Antibiotic" "Antibiotic"
[41] "Antibiotic" "Antibiotic" "Antibiotic" "Antibiotic" "Antibiotic" "Antibiotic" "Antibiotic" "Antibiotic" "Antibiotic" "Antibiotic"
sample_df <- data.frame(sample_data(phy_phy))
head(sample_df)
X.SampleID BarcodeSequence LinkerPrimerSequence Treatment Location Description
cecal_C1 cecal_C1 NA NA C cecal missing_description
cecal_C10 cecal_C10 NA NA C cecal missing_description
cecal_C2 cecal_C2 NA NA C cecal missing_description
cecal_C3 cecal_C3 NA NA C cecal missing_description
cecal_C4 cecal_C4 NA NA C cecal missing_description
cecal_C5 cecal_C5 NA NA C cecal missing_description
fitControl <- trainControl(## 10-fold CV
method = "repeatedcv",
number = 10,
classProbs = TRUE,
## repeated ten times
repeats = 10,
summaryFunction = twoClassSummary,
allowParallel = runInParallel)
fit1svmL <- caret::train(x = as.matrix(t(otu_table(phy_phy))),
y = sample_df$Location,
method = "svmLinear",
metric = "ROC",
trControl = fitControl,
verbose = FALSE)
fit1rf <- caret::train(x = as.matrix(t(otu_table(phy_phy))),
y = sample_df$Location,
metric = "ROC",
method = "rf",
trControl = fitControl,
verbose = FALSE)